import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Wordcloud python library
from wordcloud import WordCloud
import plotly.express as px
# Load the three complaint sheets of the FTC workbook in a single pass:
# passing a list to ``sheet_name`` makes pandas return a dict of DataFrames
# keyed by sheet name, so the file is opened and parsed only once.
complaints_path = 'ftc_data/00612_redacted_covid_19_complaints.xlsx'
complaint_sheets = pd.read_excel(
    complaints_path,
    sheet_name=['CIS Complaints', 'IDT Complaints', 'DNC Complaints'],
    header=0,
)
cis_complaints = complaint_sheets['CIS Complaints']
idt_complaints = complaint_sheets['IDT Complaints']
dnc_complaints = complaint_sheets['DNC Complaints']

# Each sheet keeps its free-text comment in a differently named column.
comment_sources = [
    (cis_complaints, 'Complaint Info Comments'),
    (idt_complaints, 'Theft Activity Comments'),
    (dnc_complaints, 'Other Information Comments'),
]

# Pool every comment into one lower-cased list of strings.  Empty cells are
# dropped first: str(NaN) would otherwise add a spurious 'nan' token to the
# corpus and inflate its n-gram counts.
list_complaints = [
    str(comment).lower()
    for frame, column in comment_sources
    for comment in frame[column].dropna()
]
# Substitutions applied, in order, by clean_text_round.  Every match is
# replaced by a single space.  Raw strings keep the regex escapes intact
# (non-raw '\[' / '\w' / '\(' are invalid string escapes and emit warnings).
_CLEANUP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\[.*?\]',      # text in square brackets
        r'\(b\)\(6\)',   # "(b)(6)" marker; must run BEFORE digits are removed
        r'\w*\d\w*',     # any word containing a digit
        r'<.*?>',        # markup-like tags (single line: '.' skips newlines)
        r'[\n\t]',       # newlines and tabs
        r'"',            # double quotes
        r'---',          # triple-dash separators
    )
)


def clean_text_round(text):
    """Lower-case ``text`` and blank out noise fragments.

    The following fragments are each replaced by one space: text in square
    brackets, the ``(b)(6)`` marker, words containing digits, ``<...>`` tags,
    newlines, tabs, double quotes and ``---``.  Other punctuation is left
    untouched, and runs of spaces are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    for pattern in _CLEANUP_PATTERNS:
        text = pattern.sub(' ', text)
    return text
# Tokens to discard: NLTK's English stop words plus a few corpus-specific
# leftovers listed by hand.
stop_words = set(stopwords.words('english'))
your_list = ['additional', 'comments', 'br', 'gt']

# First pass: tokenise each complaint and keep only the wanted tokens.
filtered_complaints = []
for complaint in list_complaints:
    kept_tokens = [
        str(token).lower()
        for token in nltk.word_tokenize(complaint)
        if not (token in stop_words or token in your_list)
    ]
    filtered_complaints.append(' '.join(kept_tokens))

# Second pass: regex clean-up of the re-joined text.
list_complaints = list(map(clean_text_round, filtered_complaints))
list_complaints[0]
# Getting n-grams table
def ngrams_table(n, list_texts):
    """Build a ranking table of the n-grams found in ``list_texts``.

    Args:
        n: Size of the n-grams; only n-grams of exactly this size are used
            (``ngram_range=(n, n)``).
        list_texts: Iterable of documents (strings) to vectorise.

    Returns:
        A DataFrame with columns ``term``, ``rankCount`` (count summed over
        all documents) and ``rankTFIDF`` (TF-IDF weight summed over all
        documents), sorted by ``rankCount`` descending with a fresh index.
    """
    ngram_range = (n, n)

    # Raw occurrence counts.
    count_vectorizer = CountVectorizer(ngram_range=ngram_range)
    count_matrix = count_vectorizer.fit_transform(list_texts)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        features = count_vectorizer.get_feature_names_out()
    else:
        features = count_vectorizer.get_feature_names()

    # TF-IDF weights.  Both vectorizers are configured identically, so their
    # columns are expected to line up with ``features``.
    # NOTE(review): alignment relies on both building the same vocabulary.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(list_texts)

    # Column totals: one value per term, summed across documents.
    count_totals = count_matrix.sum(axis=0)
    tfidf_totals = tfidf_matrix.sum(axis=0)

    data = [
        (term, count_totals[0, col], tfidf_totals[0, col])
        for col, term in enumerate(features)
    ]
    table = pd.DataFrame(data, columns=['term', 'rankCount', 'rankTFIDF'])
    return table.sort_values('rankCount', ascending=False).reset_index(drop=True)
# Unigram ranking table for the whole complaint corpus.
table_1grams = ngrams_table(1, list_complaints)

# Defining word cloud object
wc = WordCloud(background_color="white",
               colormap="Dark2",
               max_font_size=60,
               random_state=42)

# Kept so that any later cell referencing this name still works.
text_complaints = ' '.join([i for i in table_1grams['term']])

# Size each term by its corpus count.  Feeding the joined vocabulary to
# generate() would present every term exactly once, so the cloud would not
# reflect how often a term actually occurs.
term_counts = dict(zip(table_1grams['term'], table_1grams['rankCount']))
wc.generate_from_frequencies(term_counts)

plt.figure(figsize=(20,16))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Complaints')
plt.show()

# Top 20 unigrams, as a table and as a bar chart.
table_1grams.head(20)
px.bar(table_1grams.head(20), 'term', 'rankCount')
# Bigram ranking table for the whole complaint corpus.
table_2grams = ngrams_table(2, list_complaints)

# Defining word cloud object
wc = WordCloud(background_color="white",
               colormap="Dark2",
               max_font_size=60,
               random_state=42)

# Kept so that any later cell referencing this name still works.
text_complaints = ' '.join([i for i in table_2grams['term']])

# Draw each bigram as one phrase sized by its corpus count.  Passing the
# joined string to generate() would re-tokenise it, breaking the bigrams
# apart and discarding their counts.
bigram_counts = dict(zip(table_2grams['term'], table_2grams['rankCount']))
wc.generate_from_frequencies(bigram_counts)

plt.figure(figsize=(20,16))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Complaints')
plt.show()

# Top 20 bigrams, as a table and as a bar chart.
table_2grams.head(20)
px.bar(table_2grams.head(20), 'term', 'rankCount')
# Trigram ranking table, followed by its 20 most frequent entries shown as a
# table and as a bar chart of raw counts.
table_3grams = ngrams_table(n=3, list_texts=list_complaints)
table_3grams.head(20)
px.bar(table_3grams.head(20), x='term', y='rankCount')